Creating an Automated Feedback Pipeline with LangSmith
Manually analyzing text processed by your language model is useful, but it doesn't scale. Automated metrics offer a solution: by adding them to your LangSmith projects, you can track advanced metrics on your LLM's performance and user inputs directly from the dashboard.

[Figure: model-based feedback monitoring charts]

If the metrics reveal issues, you can isolate problematic runs for debugging or fine-tuning. This tutorial shows you how to set up an automated feedback pipeline for your language models.

Steps:

1. Filter Runs: First, identify the runs you want to evaluate. For details, refer to the Run Filtering Documentation.
2. Define Feedback Logic: Create a chain or function to calculate the feedback metrics.
3. Send Feedback to LangSmith: Use the client.create_feedback method to send metrics. Alternatively, use client.evaluate_run, which both evaluates and logs metrics for you (a minimal sketch follows the list).

We'll be using LangSmith and the hub APIs, so make sure you have the necessary API keys.
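Here is what the create_feedback call from step 3 looks like in isolation. The run ID, metric name, and score below are placeholders for illustration only; in the full pipeline, the run ID comes from client.list_runs and the score from your feedback logic.

from langsmith import Client

client = Client()

# Placeholder values for illustration only.
client.create_feedback(
    "<RUN-ID>",  # ID of the run to annotate
    key="smog_index",  # Metric name as it will appear in the dashboard
    score=9.5,  # Numeric score used in the monitoring charts
    feedback_source_type="model",  # Marks this as automated (model-based) feedback
)

Now configure your environment and API keys: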
import os
from uuid import uuid4

unique_id = uuid4().hex[0:8]

# The project the agent traces (and the example runs created below) are logged to,
# and that the feedback pipeline will read from.
project_name = f"Tracing Walkthrough - {unique_id}"  # Change to your project name

os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = project_name
# Update with your API URL if using a hosted instance of LangSmith.
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
# Update with your API key
os.environ["LANGCHAIN_API_KEY"] = "<YOUR-API-KEY>"
# Update with your Hub API URL if using a hosted instance of LangSmith.
os.environ["LANGCHAIN_HUB_API_URL"] = "https://api.hub.langchain.com"
# Update with your Hub API key
os.environ["LANGCHAIN_HUB_API_KEY"] = "<YOUR-HUB-API-KEY>"
# Used by the agent in this tutorial
os.environ["OPENAI_API_KEY"] = "<YOUR-OPENAI-API-KEY>"
from langchain import hub
from langchain.agents import AgentExecutor
from langchain.agents.format_scratchpad.openai_tools import (
format_to_openai_tool_messages,
)
from langchain.agents.output_parsers.openai_tools import OpenAIToolsAgentOutputParser
from langchain_community.tools import DuckDuckGoSearchResults
from langchain_openai import ChatOpenAI
# Pull a pinned version of the agent prompt from the LangChain Hub
prompt = hub.pull("wfh/langsmith-agent-prompt:5d466cbc")
llm = ChatOpenAI(
model="gpt-3.5-turbo-16k",
temperature=0,
)
tools = [
DuckDuckGoSearchResults(
name="duck_duck_go"
), # General internet search using DuckDuckGo
]
llm_with_tools = llm.bind_tools(tools)
runnable_agent = (
{
"input": lambda x: x["input"],
"agent_scratchpad": lambda x: format_to_openai_tool_messages(
x["intermediate_steps"]
),
}
| prompt
| llm_with_tools
| OpenAIToolsAgentOutputParser()
)
agent_executor = AgentExecutor(
agent=runnable_agent, tools=tools, handle_parsing_errors=True
)
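The agent above is simply an example application that generates traces to evaluate. Below is a minimal sketch of invoking it on a few illustrative questions (any inputs your agent can handle will do); since tracing is enabled, each call is logged to the project configured earlier. Alternatively, you can skip this and rely on the example runs created in the next step.

sample_questions = [
    "What is LangSmith?",
    "When was Llama-v2 released?",
]

# Each invocation is traced to the configured project.
results = agent_executor.batch(
    [{"input": q} for q in sample_questions],
    return_exceptions=True,  # Don't let one failing run stop the batch
)

Next, create a LangSmith client and seed the project with a few example question/answer runs so the feedback pipeline has predictable data to score: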
from langsmith import Client
from datetime import datetime
client = Client()
example_data = [
("Who trained Llama-v2?", "I'm sorry, but I don't have that information."),
(
"When did langchain first announce the hub?",
"LangChain first announced the LangChain Hub on September 5, 2023.",
),
(
"What's LangSmith?",
"LangSmith is a platform developed by LangChain for building production-grade LLM (Language Model) applications. It allows you to debug, test, evaluate, and monitor chains and intelligent agents built on any LLM framework. LangSmith seamlessly integrates with LangChain's open-source framework called LangChain, which is widely used for building applications with LLMs.\n\nLangSmith provides full visibility into model inputs and outputs at every step in the chain of events, making it easier to debug and analyze the behavior of LLM applications. It has been tested with early design partners and on internal workflows, and it has been found to help teams in various ways.\n\nYou can find more information about LangSmith on the official LangSmith documentation [here](https://docs.smith.langchain.com/). Additionally, you can read about the announcement of LangSmith as a unified platform for debugging and testing LLM applications [here](https://blog.langchain.dev/announcing-langsmith/).",
),
(
"What is the langsmith cookbook?",
"I'm sorry, but I couldn't find any information about the \"Langsmith Cookbook\". It's possible that it may not be a well-known cookbook or it may not exist. Could you provide more context or clarify the name?",
),
(
"What is LangChain?",
"I'm sorry, but I couldn't find any information about \"LangChain\". Could you please provide more context or clarify your question?",
),
("When was Llama-v2 released?", "Llama-v2 was released on July 18, 2023."),
]
for input_, output_ in example_data:
client.create_run(
name="ExampleRun",
run_type="chain",
inputs={"input": input_},
outputs={"output": output_},
project_name=project_name,
end_time=datetime.utcnow(),
)
# Only fetch today's top-level (root) runs that completed without errors.
midnight = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)

runs = list(
    client.list_runs(
        project_name=project_name, execution_order=1, start_time=midnight, error=False
    )
)
import textstat
from langsmith.schemas import Run, Example
from langchain.schema.runnable import RunnableLambda
def compute_stats(run: Run) -> None:
# Note: your chain's runs may have different keys.
# Be sure to select the right field(s) to measure!
if "input" not in run.inputs:
return
if run.feedback_stats and "smog_index" in run.feedback_stats:
# If we are running this pipeline multiple times
return
text = run.inputs["input"]
try:
fns = [
"flesch_reading_ease",
"flesch_kincaid_grade",
"smog_index",
"coleman_liau_index",
"automated_readability_index",
]
metrics = {fn: getattr(textstat, fn)(text) for fn in fns}
for key, value in metrics.items():
client.create_feedback(
run.id,
key=key,
score=value, # The numeric score is used in the monitoring charts
feedback_source_type="model",
)
    except Exception:
        # If a readability metric fails on this input, skip the run rather than halting the pipeline.
        pass
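With the scoring function defined, apply it to the runs selected above. A plain loop is a minimal sketch; for larger projects you could wrap the function in a RunnableLambda and call .batch(), as is done for the model-based evaluator below.

for run in runs:
    compute_stats(run)

Readability statistics only look at the inputs. To grade each question along richer dimensions (relevance, difficulty, verbosity, specificity), define a model-based evaluator that asks an LLM to submit scores via function calling: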
from langchain import hub
from langchain_openai import ChatOpenAI
from langchain.callbacks import collect_runs
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser

# Fetch the grading prompt before building the chain that uses it
prompt = hub.pull(
    "wfh/automated-feedback-example", api_url="https://api.hub.langchain.com"
)
chain = (
prompt
| ChatOpenAI(model="gpt-3.5-turbo", temperature=1).bind(
functions=[
{
"name": "submit_scores",
"description": "Submit the graded scores for a user question and bot response.",
"parameters": {
"type": "object",
"properties": {
"relevance": {
"type": "integer",
"minimum": 0,
"maximum": 5,
"description": "Score indicating the relevance of the question to LangChain/LangSmith.",
},
"difficulty": {
"type": "integer",
"minimum": 0,
"maximum": 5,
"description": "Score indicating the complexity or difficulty of the question.",
},
"verbosity": {
"type": "integer",
"minimum": 0,
"maximum": 5,
"description": "Score indicating how verbose the question is.",
},
"specificity": {
"type": "integer",
"minimum": 0,
"maximum": 5,
"description": "Score indicating how specific the question is.",
},
},
"required": ["relevance", "difficulty", "verbosity", "specificity"],
},
}
]
)
| JsonOutputFunctionsParser()
)
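For intuition, you can invoke the grading chain directly on a single question/prediction pair. The inputs below are illustrative and assume the hub prompt expects question and prediction variables, as used in evaluate_run further down; the parser returns a dict of integer scores.

scores = chain.invoke(
    {
        "question": "What is LangSmith?",
        "prediction": "LangSmith is a platform for debugging and monitoring LLM applications.",
    }
)
# e.g. {"relevance": 5, "difficulty": 1, "verbosity": 1, "specificity": 2}

The evaluate_run function below applies this chain to a stored run and logs each score as feedback: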
def evaluate_run(run: Run) -> None:
try:
# Note: your chain's runs may have different keys.
# Be sure to select the right field(s) to measure!
if "input" not in run.inputs or not run.outputs or "output" not in run.outputs:
return
if run.feedback_stats and "specificity" in run.feedback_stats:
# We have already scored this run
# (if you're running this pipeline multiple times)
return
with collect_runs() as cb:
result = chain.invoke(
{
"question": run.inputs["input"][:3000], # lazy truncation
"prediction": run.outputs["output"][:3000],
},
)
for feedback_key, value in result.items():
score = int(value) / 5
client.create_feedback(
run.id,
key=feedback_key,
score=score,
source_run_id=cb.traced_runs[0].id,
feedback_source_type="model",
)
    except Exception:
        # Skip runs that fail to grade rather than halting the pipeline.
        pass


wrapped_function = RunnableLambda(evaluate_run)
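Wrapping the function in a RunnableLambda lets you grade the selected runs concurrently (and trace the grading itself). A sketch of applying it, assuming the runs list selected earlier:

# Grade each run; exceptions are returned instead of raised so one bad run doesn't stop the batch.
_ = wrapped_function.batch(runs, {"max_concurrency": 10}, return_exceptions=True)

You can also package feedback logic as a RunEvaluator and let client.evaluate_run handle both evaluation and logging. The completeness evaluator below wraps LangChain's criteria evaluator: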
from typing import Optional
from langchain import evaluation, callbacks
from langsmith import evaluation as ls_evaluation
class CompletenessEvaluator(ls_evaluation.RunEvaluator):
def __init__(self):
criteria_description = (
"Does the answer provide sufficient and complete information"
"to fully address all aspects of the question (Y)?"
" Or does it lack important details (N)?"
)
self.evaluator = evaluation.load_evaluator(
"criteria", criteria={"completeness": criteria_description}
)
def evaluate_run(
self, run: Run, example: Optional[Example] = None
) -> ls_evaluation.EvaluationResult:
if (
not run.inputs
or not run.inputs.get("input")
or not run.outputs
or not run.outputs.get("output")
):
return ls_evaluation.EvaluationResult(key="completeness", score=None)
question = run.inputs["input"]
prediction = run.outputs["output"]
with callbacks.collect_runs() as cb:
result = self.evaluator.evaluate_strings(
input=question, prediction=prediction
)
run_id = cb.traced_runs[0].id
return ls_evaluation.EvaluationResult(
key="completeness", evaluator_info={"__run": {"run_id": run_id}}, **result
)
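A sketch of applying the evaluator to the selected runs; client.evaluate_run runs the evaluator and logs the resulting feedback in one call.

evaluator = CompletenessEvaluator()
for run in runs:
    client.evaluate_run(run, evaluator)

The same pattern works with off-the-shelf evaluators. The helpfulness evaluator below uses LangChain's score_string evaluator and normalizes its 1-10 score into the 0-1 range: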
from typing import Optional
from langchain.evaluation import load_evaluator
from langsmith.evaluation import RunEvaluator, EvaluationResult
from langsmith.schemas import Run, Example
class HelpfulnessEvaluator(RunEvaluator):
def __init__(self):
self.evaluator = load_evaluator(
"score_string", criteria="helpfulness", normalize_by=10
)
def evaluate_run(
self, run: Run, example: Optional[Example] = None
) -> EvaluationResult:
if (
not run.inputs
or not run.inputs.get("input")
or not run.outputs
or not run.outputs.get("output")
):
return EvaluationResult(key="helpfulness", score=None)
result = self.evaluator.evaluate_strings(
input=run.inputs["input"], prediction=run.outputs["output"]
)
return EvaluationResult(
**{"key": "helpfulness", "comment": result.get("reasoning"), **result}
)
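Apply it the same way (a sketch, assuming the same runs list):

evaluator = HelpfulnessEvaluator()
for run in runs:
    client.evaluate_run(run, evaluator)

All of the feedback logged above appears in the project's monitoring charts, where you can filter for low-scoring runs to debug or collect for fine-tuning.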